library(tidyverse)
library(cowplot)
################################################################################
# Setup
################################################################################
# Clear every object from the current environment before the analysis starts
# (kept from the original script; note this does not detach packages).
rm(list = ls())

# - set dir
# Make the directory that contains this script the working directory, so the
# relative "../data" and "../img" paths used below resolve correctly.
args <- commandArgs()

# Rscript passes the script location as a "--file=<path>" argument.
scriptName <- args[startsWith(args, "--file=")]

if (length(scriptName) == 0) {
  # No "--file=" argument: fall back to the file open in the RStudio editor.
  if (!requireNamespace("rstudioapi", quietly = TRUE) ||
      !rstudioapi::isAvailable()) {
    stop(
      "Cannot determine the script location: run this file with Rscript ",
      "or from within RStudio.",
      call. = FALSE
    )
  }
  scriptName <- rstudioapi::getSourceEditorContext()$path
  if (is.null(scriptName) || !nzchar(scriptName)) {
    stop(
      "Cannot determine the script location: save the script to disk first.",
      call. = FALSE
    )
  }
} else {
  scriptName <- sub("^--file=", "", scriptName[1])
}

# dirname() also handles a bare file name (e.g. `Rscript script.R`), for which
# it returns "."; the previous strsplit()-based extraction produced NA there
# and made setwd() fail.
pathName <- dirname(scriptName)

setwd(pathName)
################################################################################
# Analysis
################################################################################

# Raw results as read from the CSV file.
d0 <- read_csv("../data/analysis_data_ChatGPT_results.csv")
print(d0, n = 100)

# Display order of the factor levels used in the plots below.
sample_order <- c(
  "Tweets (Content Moderation, 2021)",
  "Tweets (Content Moderation, 2023)",
  "News Articles (Content Moderation, 2021)",
  "Tweets (US Congress, 2017-2022)"
)
task_order <- c("Relevance", "Stance", "Topics", "Frames I", "Frames II")
annotator_order <- c(
  "Trained annotators",
  "MTurk",
  "ChatGPT (temp 1)",
  "ChatGPT (temp 0.2)"
)

# Long format: every column other than the identifier columns becomes one
# Metric / Value pair per row. Values are divided by 100, and Task, Sample and
# Annotators become factors with the level orders defined above.
d <- d0 %>%
  pivot_longer(
    cols = !c(Task_Classes, Task, Subject, Type, Year, Annotators),
    names_to = "Metric",
    values_to = "Value"
  ) %>%
  mutate(
    Metric = if_else(Metric == "ICA", "Intercoder Agreement", "Accuracy"),
    Value = Value / 100,
    Task_Long = paste0(Task, "\n(", Type, " ", Year, ")", "\n(", Subject, ")"),
    Sample = paste0(Type, " (",  Subject, ", ", Year, ")"),
    Sample = fct_relevel(Sample, sample_order),
    # Reversed so that the first task is listed last among the levels.
    Task = fct_relevel(Task, rev(task_order)),
    Annotators = fct_relevel(Annotators, annotator_order)
  )


# Accuracy per task, one facet per sample. Rows for trained annotators are
# excluded from this figure. Saved as both PNG and PDF.
g_accuracy <- d %>%
  filter(Metric == "Accuracy", Annotators != "Trained annotators") %>%
  ggplot(aes(
    x = Value,
    y = Task,
    fill = Annotators,
    group = desc(Annotators)
  )) +
  geom_col(
    position = position_dodge2(),
    width = 0.75,
    color = "black",
    linewidth = 0.1
  ) +
  facet_wrap(~ Sample, scales = "free_y") +
  scale_fill_brewer(type = "seq", palette = "OrRd", direction = 1) +
  scale_x_continuous(labels = scales::percent) +
  labs(x = "", y = "") +
  theme_light() +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "black"),
    legend.title = element_blank()
  )

walk(
  c(
    "../img/plot_ChatGPT_v2_accuracy.png",
    "../img/plot_ChatGPT_v2_accuracy.pdf"
  ),
  function(path) ggsave(path, g_accuracy, width = 9, height = 6)
)

# Intercoder agreement per task, one facet per sample, for every annotator
# type. Saved as both PNG and PDF.
g_ica <- d %>%
  filter(Metric == "Intercoder Agreement") %>%
  ggplot(aes(
    x = Value,
    y = Task,
    fill = Annotators,
    group = desc(Annotators)
  )) +
  geom_col(
    position = position_dodge2(),
    width = 0.75,
    color = "black",
    linewidth = 0.1
  ) +
  facet_wrap(~ Sample, scales = "free_y") +
  scale_fill_brewer(type = "seq", palette = "OrRd", direction = 1) +
  scale_x_continuous(labels = scales::percent) +
  labs(x = "", y = "") +
  theme_light() +
  theme(
    strip.background = element_blank(),
    strip.text = element_text(color = "black"),
    legend.title = element_blank()
  )

walk(
  c(
    "../img/plot_ChatGPT_v2_ica.png",
    "../img/plot_ChatGPT_v2_ica.pdf"
  ),
  function(path) ggsave(path, g_ica, width = 9, height = 6)
)


# Combined figure: one panel per sample, each faceted by metric, arranged in a
# grid with a single shared legend underneath.

# Build the panel for one sample.
#
# data:        long-format results (columns Sample, Task, Value, Annotators,
#              Metric are used).
# sample_name: value of `Sample` whose rows are plotted.
# panel_title: title shown above the panel.
#
# Returns a ggplot object.
plot_sample_panel <- function(data, sample_name, panel_title) {
  ggplot(data = filter(data, .data$Sample == sample_name)) +
    aes(y = Task, x = Value, group = desc(Annotators), fill = Annotators) +
    geom_col(
      position = position_dodge2(),
      width = 0.75,
      color = "black",
      linewidth = 0.1
    ) +
    scale_x_continuous(labels = scales::percent) +
    # drop = FALSE keeps every annotator level in the scale, so a given
    # annotator gets the same colour in every panel. The legend is taken from
    # one panel only and must therefore be valid for all of them.
    scale_fill_brewer(
      type = "seq",
      palette = "OrRd",
      direction = 1,
      drop = FALSE
    ) +
    facet_wrap(~ Metric) +
    labs(x = "", y = "", title = panel_title) +
    theme_light() +
    theme(
      strip.background = element_blank(),
      strip.text = element_text(color = "black", size = 11, face = "italic"),
      legend.title = element_blank(),
      legend.text = element_text(size = 10.5),
      panel.spacing.x = unit(1, "lines"),
      axis.text = element_text(size = 11),
      title = element_text(face = "bold"),
      plot.margin = unit(c(0, 0.35, 0, 0), "cm")
    )
}

g_tweets_cm_2021 <- plot_sample_panel(
  d, "Tweets (Content Moderation, 2021)", "A. Tweets (2020-2021)"
)

g_tweets_cm_2023 <- plot_sample_panel(
  d, "Tweets (Content Moderation, 2023)", "C. Tweets (2023)"
)

g_news_cm <- plot_sample_panel(
  d, "News Articles (Content Moderation, 2021)", "B. News Articles (2020-2021)"
)

g_tweets_congress <- plot_sample_panel(
  d, "Tweets (US Congress, 2017-2022)", "D. Tweets (2017-2022)"
)

# Shared legend, extracted from the first panel with the legend at the bottom.
# NOTE(review): with recent ggplot2 releases cowplot::get_legend() has been
# reported to return an empty grob for non-right legends -- confirm that the
# legend actually renders in the saved PDF.
legend <- get_legend(
  g_tweets_cm_2021 +
    theme(legend.position = "bottom")
)

# Panels in reading order A, B, C, D with their individual legends removed.
pgrid <- plot_grid(
  g_tweets_cm_2021 + theme(legend.position = "none"),
  g_news_cm + theme(legend.position = "none"),
  g_tweets_cm_2023 + theme(legend.position = "none"),
  g_tweets_congress + theme(legend.position = "none")
)

# Stack the panel grid above the legend; the legend gets a tenth of the
# grid's height.
g <- plot_grid(
  pgrid,
  legend,
  ncol = 1,
  rel_heights = c(1, 0.1)
)

ggsave("../img/plot_ChatGPT_v2_combined.pdf", g, width = 10, height = 6)


# Average outperformance
#
# Mean accuracy per annotator type (MTurk and the two ChatGPT settings), then
# the ratio of ChatGPT (temp 1) to MTurk.

d2 <- d %>%
  filter(
    Annotators %in% c("ChatGPT (temp 1)", "ChatGPT (temp 0.2)", "MTurk"),
    Metric == "Accuracy"
  ) %>%
  group_by(Annotators) %>%
  summarize(Mean_Accuracy = mean(Value))

# Look the means up by annotator name rather than by row position: row order
# depends on the factor level order and on which groups survive the filter.
# With the level order set when `d` is built, rows 2 and 1 (the previous
# d2[2,2]/d2[1,2]) are ChatGPT (temp 1) and MTurk, so the same ratio is
# computed. `[[` fails loudly if either annotator is missing.
mean_accuracy <- setNames(d2$Mean_Accuracy, as.character(d2$Annotators))
mean_accuracy[["ChatGPT (temp 1)"]] / mean_accuracy[["MTurk"]] # 1.630853


# Accuracy of ChatGPT (temp 0.2) on the Relevance task, one row per sample
# (no averaging is done here; the recorded output is kept below).

d %>%
  filter(
    Task == "Relevance",
    Metric == "Accuracy",
    Annotators == "ChatGPT (temp 0.2)"
  ) %>%
  select(all_of(c("Task", "Year", "Type", "Subject", "Value")))
# # A tibble: 4 × 5
#   Task      Year      Type          Subject            Value
#   <fct>     <chr>     <chr>         <chr>              <dbl>
# 1 Relevance 2021      News Articles Content Moderation 0.808
# 2 Relevance 2021      Tweets        Content Moderation 0.702
# 3 Relevance 2023      Tweets        Content Moderation 0.588
# 4 Relevance 2017-2022 Tweets        US Congress        0.828


# Average ICA by annotator type
#
# Mean intercoder agreement for every annotator type (not only ChatGPT). The
# summary column is named Mean_ICA because it averages the
# "Intercoder Agreement" metric, not accuracy.

d %>%
  filter(Metric == "Intercoder Agreement") %>%
  group_by(Annotators) %>%
  summarize(Mean_ICA = mean(Value))
# # A tibble: 4 × 2
#   Annotators         Mean_ICA
#   <fct>                 <dbl>
# 1 Trained annotators    0.793
# 2 MTurk                 0.557
# 3 ChatGPT (temp 1)      0.905
# 4 ChatGPT (temp 0.2)    0.973

## Correlation accuracy / ICA

# Accuracy vs. ICA across all rows that are neither MTurk nor trained
# annotators.
d1 <- d0 %>%
  filter(!Annotators %in% c("MTurk", "Trained annotators"))
with(d1, cor(Accuracy, ICA))

d0

# One row per task with Accuracy_* / ICA_* columns per annotator type:
# ICA of the trained annotators vs. accuracy of ChatGPT (temp 0.2).
d3 <- d0 %>%
  filter(Annotators %in% c("ChatGPT (temp 0.2)", "Trained annotators")) %>%
  pivot_wider(names_from = Annotators, values_from = c(Accuracy, ICA))
cor(d3[["ICA_Trained annotators"]], d3[["Accuracy_ChatGPT (temp 0.2)"]])


# Same wide layout including MTurk, plus the accuracy gap between
# ChatGPT (temp 0.2) and MTurk; correlate that gap with the trained
# annotators' ICA.
d4 <- d0 %>%
  filter(
    Annotators %in% c("ChatGPT (temp 0.2)", "MTurk", "Trained annotators")
  ) %>%
  pivot_wider(names_from = Annotators, values_from = c(Accuracy, ICA)) %>%
  mutate(
    Overperformance_ChatGPT = `Accuracy_ChatGPT (temp 0.2)` - Accuracy_MTurk
  )

cor(d4[["ICA_Trained annotators"]], d4[["Overperformance_ChatGPT"]])


